*************************************************
* Purpose -- Preparing different samples for 
* analysis using QCEW data
************************************************
clear all
* RUN 00_path_master.do FIRST TO GET FILEPATHS


use "$clean/qcew/qcew_annual_clean_county.dta", clear
sort countyfip year, stable
xtset countyfips year

************************************************
* CZ-state dataset
************************************************

frame copy default cz_state
frame change cz_state

sum pop if year == 2019, d
local county_pop_total = `r(sum)' //this is to get proportion of total pop covered

* keeping only balanced sample
keep if balanced_full_75 == 1

sum pop if year == 2019, d
local county_pop_balanced = `r(sum)'

* calculating share of population kept
global county_share = (`county_pop_balanced'/`county_pop_total')*100
global county_share: disp %4.2f $county_share

* calculating share of counties kept
distinct countyfips
global county_num_balanced = `r(ndistinct)'
global county_coverage = (${county_num_balanced}/3109)*100
global county_coverage: disp %3.2f $county_coverage

egen czstate = group(czone statefips)

* collapsing dataset to cz-state-year level
collapse (mean) mw fed_mw index czone statefips (sum) annual_avg_estabs70 annual_avg_emplvl70 ///
total_annual_wages70 annual_avg_estabs75 annual_avg_emplvl75 total_annual_wages75 ///
pop (first) statename, by(czstate year)

* creating basic variables
gen lemp75 = ln(annual_avg_emplvl75)
gen lmw = ln(mw)
gen lpop = ln(pop)

gen avg_earnings = (total_annual_wages75/annual_avg_emplvl75)/52.14 // average weekly earnings per worker
gen real_earnings = avg_earnings/index
gen learn75 = ln(real_earnings)

gen lemp_other = ln(annual_avg_emplvl70 - annual_avg_emplvl75)

gen avg_earnings_other = ((total_annual_wages70 - total_annual_wages75)/(annual_avg_emplvl70 - annual_avg_emplvl75))/52.14
gen real_earnings_other = avg_earnings_other/index
gen learn_other = ln(real_earnings_other)

xtset czstate year
tempfile czstate_year
save `czstate_year'

save "$clean/qcew/cz_state_sample.dta", replace


* creating another CZ-state dataset with all counties and only population data to be used later
frame copy default cz_state_pop
frame change cz_state_pop

egen czstate = group(czone statefips)

collapse (mean) czone statefips (sum) pop (first) statename, by(czstate year)

tempfile czstate_year_pop
save `czstate_year_pop'


************************************************
* MSCZ pairs dataset
************************************************
frame create mscz
frame change mscz

* using the next few lines of code to just get the pairs as per JNR
use "$raw/jnr-cbp/cbp_stacked_czonestatepair_sample.dta", clear

rename (czonestatepair state) (pair_id statefips)
egen pair_id_czstate = group(pair_id czonestate) 
sort pair_id_czstate year, stable
keep pair_id pair_id_czstate czone statefips year

* creating observations for 2017-2019 for merging
forvalues i = 2017/2019 {
	insobs 1
	replace year = `i' if _n == _N
}

fillin pair_id_czstate year
drop if mi(pair_id_czstate)
drop _fillin


* some years were missing in CBP data, and also pair_id and county will be missing for 1990-2019 data
* just copying these variables within a pair_id_county

foreach var in pair_id czone statefips {
	bys pair_id_czstate (`var'): replace `var' = `var'[1] if mi(`var')
}


foreach var of varlist _all {
	assert ~mi(`var')
}

* next few lines of code (within the two preserve-restores are used to calculate
* coverage in terms of pairs and in terms of population for this sample)
* we report this in the paper
preserve
	merge m:1 czone state year using `czstate_year_pop'
	keep if _merge == 3
	drop _merge
	
	duplicates drop czone statefips year, force
	sum pop if year == 2019, d
	local cz_pop_total = `r(sum)'
	
restore

preserve
	merge m:1 czone state year using `czstate_year'
	keep if _merge == 3
	drop _merge
	
	bys pair_id: gen obs = _N
	keep if obs == 60
	distinct pair_id
	global cz_num_balanced = `r(ndistinct)'
	global cz_coverage = (${cz_num_balanced}/151)*100
	global cz_coverage: disp %4.2f $cz_coverage
	duplicates drop czone statefips year, force
	sum pop if year == 2019, d
	local cz_pop_balanced = `r(sum)'
	global cz_share = (`cz_pop_balanced'/`cz_pop_total')*100
	global cz_share: disp %4.2f $cz_share
	
restore
	
* now merging JNR pairs with our CZ_state dataset	
merge m:1 czone state year using `czstate_year'

* only keeping CZ_state that are part of MSCZ pairs
keep if _merge == 3
drop _merge

* dropping pair_id_czstate for whom we do not have data for the other pair (mostly because of balanced data)
bys pair_id: gen obs = _N
keep if obs == 60

xtset pair_id_czstate year
egen pair_id_num = group(pair_id)

save "$clean/qcew/mscz_sample.dta", replace


************************************************
* Border county pairs dataset
************************************************
* starting from county-level dataset instead of CZ-state
frame copy default bcp
frame change bcp

* creating variables like before
gen lemp75 = ln(annual_avg_emplvl75)
gen lmw = ln(mw)
gen lpop = ln(pop)

gen avg_earnings = (total_annual_wages75/annual_avg_emplvl75)/52.14
gen real_earnings = avg_earnings/index
gen learn75 = ln(real_earnings)

gen lemp_other = ln(annual_avg_emplvl70 - annual_avg_emplvl75)

gen avg_earnings_other = ((total_annual_wages70 - total_annual_wages75)/(annual_avg_emplvl70 - annual_avg_emplvl75))/52.14
gen real_earnings_other = avg_earnings_other/index
gen learn_other = ln(real_earnings_other)

tempfile county_year
save `county_year'


* again getting county pairs from JNR's replication files
use "$raw/jnr-cbp/cbp_stacked_countypair_sample.dta", clear

frame copy bcp bcp_default
frame change bcp_default
* Keep only border counties
	merge m:1 countypair_id county using "$raw/jnr-cbp/countypairtypes.dta", nogen assert(3)
	keep if type0 == 1
drop type*


forvalues i = 1/4 {
	frame copy bcp bcp_ss`i'
	frame change bcp_ss`i'
	merge m:1 countypair_id county using "$raw/jnr-cbp/countypairtypes.dta", nogen
	keep if type`i' == 1
	drop type*
}

local f = 1
foreach frame in bcp_default bcp_ss1 bcp_ss2 bcp_ss3 bcp_ss4 {
	frame change `frame'

	rename (countypair_id county) (pair_id countyfips)
	egen pair_id_county = group(pair_id countyfips) 
	sort pair_id_county year, stable
	keep pair_id pair_id_county countyfips year

	forvalues i = 2017/2019 {
		insobs 1
		replace year = `i' if _n == _N
	}

	fillin pair_id_county year
	drop if mi(pair_id_county)
	drop _fillin

	* some years were missing in BCP data, and also pair_id and county will be missing for 1990-2019 data
	* just copying these variables within a pair_id_county

	bys pair_id_county (pair_id): replace pair_id = pair_id[_N] if mi(pair_id)
	bys pair_id_county (countyfips): replace countyfips = countyfips[1] if mi(countyfips)

	foreach var of varlist _all {
		assert ~mi(`var')
	}

	tempfile bcp
	save `bcp'

	*now merging JNR pairs with our county dataset
	merge m:1 countyfips year using `county_year'

	keep if _merge == 3
	drop _merge

	if `f' == 1 {
		preserve
			duplicates drop countyfips year, force
			sum pop if year == 2019, d
			local bcp_pop_total = `r(sum)'
		restore
	}
	

	keep if balanced_full_75 == 1

	* dropping counties for whom we do not have data for the other pair due to balanced pair
	bys pair_id: gen obs = _N
	keep if obs == 60

	if `f' == 1 {
		distinct pair_id
		global bcp_num_balanced = `r(ndistinct)'
		global bcp_coverage = (${bcp_num_balanced}/1165)*100
		global bcp_coverage: disp %4.2f $bcp_coverage

		preserve
			duplicates drop countyfips year, force
			sum pop if year == 2019, d
			local bcp_pop_balanced = `r(sum)'
			global bcp_share = (`bcp_pop_balanced'/`bcp_pop_total')*100
			global bcp_share: disp %4.2f $bcp_share
		restore
	}
	
	xtset pair_id_county year
	egen pair_id_num = group(pair_id)
	
	local ++f
}

frame change bcp_default
save "$clean/qcew/bcp_sample.dta", replace

forvalues i = 1/4 {
	frame change bcp_ss`i'
	save "$clean/qcew/bcp_subsample`i'.dta", replace
}


************************************************
* States dataset
************************************************

frame create state
frame change state
use "$clean/qcew/qcew_annual_clean_state.dta", replace

drop if year < 1990
xtset statefips year

* creating variables like before
gen lemp75 = ln(annual_avg_emplvl55)
gen lmw = ln(mw)
gen lpop = ln(pop)

gen avg_earnings = (total_annual_wages55/annual_avg_emplvl55)/52.14
gen real_earnings = avg_earnings/index
gen learn75 = ln(real_earnings)

gen lemp_other = ln(annual_avg_emplvl51 - annual_avg_emplvl55)

gen avg_earnings_other = ((total_annual_wages51 - total_annual_wages55)/(annual_avg_emplvl51 - annual_avg_emplvl55))/52.14
gen real_earnings_other = avg_earnings_other/index
gen learn_other = ln(real_earnings_other)


save "$clean/qcew/state_sample.dta", replace


************************************************
* Output coverage and pop share table
************************************************

matrix coverage = (3109, ${county_num_balanced}, ${county_coverage}, ${county_share} \ ///
				   151, ${cz_num_balanced}, ${cz_coverage}, ${cz_share} \ ///
				   1165, ${bcp_num_balanced}, ${bcp_coverage}, ${bcp_share})

matrix rownames coverage = "Counties" "MSCZ pairs" "Border county pairs"

esttab matrix(coverage) using "$tables/Table B1.tex", replace booktabs ///
mlabels(none) ///
prehead(\begin{table}[htbp]\centering ///
\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi} ///
\caption{Coverage and population shares of QCEW samples \label{tab:coverage}} ///
\begin{tabular*}{\hsize} ///
{@{\hskip\tabcolsep\extracolsep\fill}l*{7}{c}} ///
\toprule) ///
collabels("\shortstack{Total units\\(counties/pairs)}" "\shortstack{Units in sample\\(counties/pairs)}" ///
"\shortstack{Percentage of\\units in sample}" "\shortstack{Percentage of\\population covered}") ///
postfoot( ///
\hline\hline ///
\end{tabular*} ///
{\centering ///
\caption*{\begin{footnotesize} ///
This table shows the coverage of QCEW data by number of units (pairs/counties) and populations. Row 1 shows the balanced ///
QCEW sample sizes. Rows 2 and 3 show coverage for the complete pairs in the border county and MSCZ pairs regressions. ///
The first column reports the total number of units/pairs in JNR's sample. Samples exclude Alaska, Hawaii, and DC. ///
\end{footnotesize}}} ///
\end{table} ///
)